import requests
import re

# HTTP headers passed to requests.get for each page; the User-Agent value
# is a Chrome 90 on Windows 10 browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.85 Safari/537.36'
    ),
}

# Title markup, e.g. <b>长安秋望</b>
_TITLE_RE = re.compile(r'<b>(.*?)</b>', re.DOTALL)
# Author and dynasty are the first two links inside <p class="source">, e.g.
# <p class="source"><a ...>杜牧</a><a ...>〔唐代〕</a></p>
_SOURCE_RE = re.compile(
    r'<p class="source">.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>', re.DOTALL
)
# Poem text lives in <div class="contson" ...> ... </div>
_POEM_RE = re.compile(r'<div class="contson".*?>(.*?)</div>', re.DOTALL)
# Any HTML tag; [^>] (rather than .) also covers tags that span lines.
_TAG_RE = re.compile(r'<[^>]+>')


def parse_poems(html_content):
    """Extract poem records from one listing page.

    Args:
        html_content: Decoded HTML text of a listing page.

    Returns:
        A list of dicts with the keys 'title', 'dynasty', 'author' and
        'poem', in page order. Empty when nothing matches.
    """
    titles = _TITLE_RE.findall(html_content)
    sources = _SOURCE_RE.findall(html_content)
    bodies = _POEM_RE.findall(html_content)
    # The three lists are paired purely by position and zip stops at the
    # shortest one.
    # NOTE(review): any other <b> element on the page would shift the
    # title alignment -- confirm against the live markup.
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            # Strip every tag (e.g. <br />, <p>), not only those that
            # happen to be preceded by whitespace.
            'poem': _TAG_RE.sub('', body).strip(),
        }
        for title, (author, dynasty), body in zip(titles, sources, bodies)
    ]


def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of parsing an error page as if it were a listing.
    response.raise_for_status()
    return response.content.decode('utf8')


all_poetry = []
count = 0
urls = [f'https://www.gushiwen.cn/default_{i}.aspx' for i in range(1, 6)]

if __name__ == '__main__':
    for count, url in enumerate(urls, start=1):
        all_poetry.extend(parse_poems(fetch_page(url)))
        print('第{}页已经爬完'.format(count))
    print(all_poetry)